File

sample_gt_cnvEst.Rmd

Index

Overview

Genotype data (snp, chr, pos, donor.gt, donor.lrr, donor.baf) from samples that are either disomic (two copies) or trisomic (three copies) for human chromosome 21 (HSA21). Identify contiguous intra-chromosomal segments by calculating significantly differential average LRR. Smooth these segments and utilize mapped regions to calculate segmental duplication and mosacism. 1. Estimate segmental duplication by calculating the proportion of HSA21 chromosome with average LRR indicative of triplication (> 0.2). 2. Estimate the fraction of triplicated/mosaic cells by BAF heterozygosity.

Metrics

LogRRatio (LRR): log2(observed signal intensity/expected intensity) with expected based on set of reference samples with normal copy number of 2. Perfect trisomic LRR = log2(3/2) = log2(1.5) = 0.585

B Allele Frequency: Fraction of intensity from the B allele of the total allelic intensity for each SNP locus (B/A+B). Diploid frequencies: AA = 0, AB = 0.5, BB = 1. Triploid frequencies: AAA = 0, AAB = 0.33, ABB = 0.67, BBB=1.

Environment Initialization

Initialize Packages

#Data Manipulation
library(dplyr)
library(stringr)
library(purrr)
library(data.table)

#Plotting
library(ggplot2)

#Segment
library(changepoint)
library(segmented)
#library(bcp)

#DNA Copy
library(DNAcopy)

#RMarkdown
library(knitr)
library(kableExtra)

Initialize Directories (harddrive)

out.dir <- paste0("./Ts21_trisomyType/")
function.dir <- ("/Users/alexis.weber/Desktop/Rfunctions/")

Intialize Functions

source(paste0(function.dir, "plot_baf_allchr_function_v1.R"))
source(paste0(function.dir, "gain_coverage_function_v1.R"))
source(paste0(function.dir, "plot_lrr_seg_function_v1.R"))
source(paste0(function.dir, "estimate_mosaic_function_v1.R"))
source(paste0(function.dir, "plot_baf_mosaic_function_v1.R"))

Data

Load Ts21 Neocortex Donor GT Data

gt <- read.csv("tissueGT_donors_df.csv")[,-1]

Processed Ts21 Neocortex GT Data by Donor

gt %>%
  slice_head(n=10) %>%
  dplyr::select(c(1:9)) %>%
  kbl(caption="Pre-Processed Ts21 GT Data by Donor") %>%
  kable_styling(
    bootstrap_optio = c("hover", "condensed"),
    position = "center",
    html_font = "Arial Narrow") %>%
  kable_paper(full_width=F)
Pre-Processed Ts21 GT Data by Donor
Name Chr Position Donor1.GType Donor1.B.Allele.Freq Donor1.Log.R.Ratio Donor2.GType Donor2.B.Allele.Freq Donor2.Log.R.Ratio
MitoA8870G 1 569418 AA 0.0000000 0.0953033 AA 0.0000000 -0.0046847
rs3131972 1 752721 BB 1.0000000 -0.0248816 AB 0.5722373 -0.0070789
rs11240777 1 798959 AB 0.4435757 0.0391411 AB 0.4942916 -0.0085405
rs4970383 1 838555 BB 1.0000000 0.2787446 BB 1.0000000 0.3886468
rs4475691 1 846808 BB 0.9852062 -0.1684360 BB 1.0000000 -0.3154612
rs13302982 1 861808 AB 0.5258095 -0.2321303 BB 0.9857778 -0.0081681
rs28391282 1 904165 BB 1.0000000 -0.2584022 BB 1.0000000 -0.3457685
rs2341354 1 918573 BB 0.9982755 -0.0512009 AB 0.4674685 -0.0828473
rs9777703 1 928836 AA 0.0075618 0.0184619 AA 0.0000000 0.0325716
rs1891910 1 932457 BB 1.0000000 -0.1466070 AB 0.5296988 -0.0345782

Vectorize GT Data by Donor

Donors

donors <- gt %>%
  dplyr::select(4:ncol(gt)) %>%
  names() %>%
  str_remove("\\..*") %>%
  unique()

Split Genotype Data by Donor

colnames(gt)[1] <- "SNP"

chr_pos_snp <- names(gt)[1:3]

donor_gt_list <- map(set_names(donors), function(don) {
  donor_cols <- names(gt)[str_starts(names(gt), paste0(don, "\\."))]
  gt %>% 
    dplyr::select(all_of(chr_pos_snp), all_of(donor_cols)) %>%
    rename_with(~str_remove(.x, paste0("^", don, "\\.")),
                .cols=all_of(donor_cols))
})

donor_gt_list[[7]] %>%
  slice_head(n=10) %>%
  kbl(caption = paste0("Disomic HSA21 Sample Split GT Data")) %>%
  kable_styling(
    bootstrap_options = c("hover", "condensed"),
    position = "center",
    html_font = "Arial Narrow") %>%
  kable_paper(full_width = F)
Disomic HSA21 Sample Split GT Data
SNP Chr Position GType B.Allele.Freq Log.R.Ratio
MitoA8870G 1 569418 AA 0.0000000 -0.0184992
rs3131972 1 752721 AB 0.4931460 -0.1062123
rs11240777 1 798959 AB 0.4591190 -0.0261659
rs4970383 1 838555 AB 0.5282991 0.3125351
rs4475691 1 846808 BB 0.9850221 -0.2959914
rs13302982 1 861808 BB 1.0000000 -0.1905885
rs28391282 1 904165 BB 1.0000000 -0.2567204
rs2341354 1 918573 AB 0.4884972 -0.1037821
rs9777703 1 928836 AA 0.0000000 0.0389145
rs1891910 1 932457 AB 0.5057924 -0.1999879

Retain only HSA21 and Remove NA

donor_hsa21_list <- lapply(donor_gt_list, function(df) {
  df %>% filter(Chr == "21", !is.na(Log.R.Ratio) & !is.na(B.Allele.Freq))
})

donor_hsa21_list[[7]] %>%
  slice_head(n=10) %>%
  kbl(caption=paste0("Disomic HSA21 Sample Split GT Data")) %>%
  kable_styling(
    bootstrap_options = c("hover", "condensed"),
    position= "center",
    html_font = "Arial Narrow") %>%
  kable_paper(full_width = FALSE)
Disomic HSA21 Sample Split GT Data
SNP Chr Position GType B.Allele.Freq Log.R.Ratio
indel.104379 21 0 AA 0.0054731 -0.0063491
rs10439884 21 10971951 BB 0.9949098 -0.0155799
rs1148737 21 14669931 BB 0.9740081 -0.0529554
rs2801270 21 14670124 AA 0.0396479 0.1131889
kgp12342612 21 14670189 AA 0.0527328 0.3453495
rs3126398 21 14779379 AB 0.4518186 0.1012122
rs16998778 21 14796635 BB 0.8898491 0.0985096
rs469000 21 14827698 AA 0.0530317 0.0302170
rs467489 21 14836389 BB 0.9905307 -0.0516756
kgp13084375 21 14844200 BB 0.9996610 -0.0762860

Log(R) Ratio of HSA21

lrr_plots <- purrr::imap(donor_hsa21_list, ~{
  dn <- .y
  df <- .x

  ggplot(df, aes(x = Position, y = Log.R.Ratio)) +
    geom_point(size = 0.7, alpha = 0.6) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "grey40") +
    geom_hline(yintercept = 0.2, linetype = "dotted", color = "red", linewidth = 0.6) +
    labs(
      title = "Log R Ratio (LRR) across genome",
      x = "Genomic Position",
      y = "Log R Ratio (LRR)"
    ) +
    theme_bw()
})

Disomic Sample HSA21 LRR

lrr_plots[[7]] +
    ggtitle("Disomic HSA21 Sample, HSA21 LRR")

Trisomic Sample HSA21 LRR

lrr_plots[[6]] +
    ggtitle("Trisomic HSA21 Sample, HSA21 LRR")

B-Allele Frequency

baf_plots <- purrr::imap(donor_gt_list, ~{
  dn <- .y
  df <- .x
  
  plot_baf(df)

})

Disomic Sample BAF

baf_plots[[7]] +
    ggtitle("Disomic Sample BAF")
## Warning: Removed 566397 rows containing missing values or values outside the scale range
## (`geom_point()`).

Trisomic Sample BAF

baf_plots[[6]] +
    ggtitle("Trisomic Sample BAF")
## Warning: Removed 566395 rows containing missing values or values outside the scale range
## (`geom_point()`).

Copy Number Analysis Object

Build CNA Object with LRR

Per donor, input GT data: chromosome, position, Log.R.Ratio, data.type = “logratio”, name of the donor.

cna_list <- lapply(seq_along(donor_hsa21_list), function(i) {
  df <- donor_hsa21_list[[i]]
  cna.obj <- CNA(genomdat = df$Log.R.Ratio,
               chrom    = df$Chr,
               maploc   = df$Pos,
               data.type= "logratio",
               sampleid = names(donor_hsa21_list)[i])
  cna.obj
})
## Warning in CNA(genomdat = df$Log.R.Ratio, chrom = df$Chr, maploc = df$Pos, : array has repeated maploc positions
## Warning in CNA(genomdat = df$Log.R.Ratio, chrom = df$Chr, maploc = df$Pos, : array has repeated maploc positions
## Warning in CNA(genomdat = df$Log.R.Ratio, chrom = df$Chr, maploc = df$Pos, : array has repeated maploc positions
## Warning in CNA(genomdat = df$Log.R.Ratio, chrom = df$Chr, maploc = df$Pos, : array has repeated maploc positions
## Warning in CNA(genomdat = df$Log.R.Ratio, chrom = df$Chr, maploc = df$Pos, : array has repeated maploc positions
## Warning in CNA(genomdat = df$Log.R.Ratio, chrom = df$Chr, maploc = df$Pos, : array has repeated maploc positions
## Warning in CNA(genomdat = df$Log.R.Ratio, chrom = df$Chr, maploc = df$Pos, : array has repeated maploc positions
## Warning in CNA(genomdat = df$Log.R.Ratio, chrom = df$Chr, maploc = df$Pos, : array has repeated maploc positions
## Warning in CNA(genomdat = df$Log.R.Ratio, chrom = df$Chr, maploc = df$Pos, : array has repeated maploc positions
## Warning in CNA(genomdat = df$Log.R.Ratio, chrom = df$Chr, maploc = df$Pos, : array has repeated maploc positions
## Warning in CNA(genomdat = df$Log.R.Ratio, chrom = df$Chr, maploc = df$Pos, : array has repeated maploc positions
## Warning in CNA(genomdat = df$Log.R.Ratio, chrom = df$Chr, maploc = df$Pos, : array has repeated maploc positions
## Warning in CNA(genomdat = df$Log.R.Ratio, chrom = df$Chr, maploc = df$Pos, : array has repeated maploc positions
## Warning in CNA(genomdat = df$Log.R.Ratio, chrom = df$Chr, maploc = df$Pos, : array has repeated maploc positions
names(cna_list) <- names(donor_hsa21_list)

Process CNA Object

Smoothing

Smooth LRR: per probe, set a local window (smooth.region = 10, i.e. 10 probes in either direction) and estimate local center/scale, then flag outliers (outlier.SD.scale=4, i.e. 4 SD units threshold). Outliers are then scaled down (smooth.SD.scale ± 2, i.e. within 2 SD units).

Segmentation

Segment chromosome: per probe, identify changepoints by the largest LRR mean difference between earlier (closer to centromere) or later (closer to q telomere) positions on the chromosome, by maximal t statistic (pvalue < alpha). Within segments, internal changepoints are identified by recursively testing each probe (min.width). Output per segment by the chromosome, segment start, segment end, the number of probes (num.mark) and the mean LRR (seg.mean).

Segment Copy Number

CN per segment is approximated by the mean LRR: greater than 0.15 is approximately 3 copies and less than -0.30 is 1 copy and between these LRR is approximately 2 copies.

seg_df_list <- lapply(cna_list, function(df){
  seg <- segment(smooth.CNA(df), alpha=0.01, min.width=2, verbose=0)
  segdf <- seg$output
  segdf$CN <- with(segdf, ifelse(seg.mean >  0.15, 3,
                        ifelse(seg.mean < -0.30, 1, 2)))
  segdf
})

seg_df_list[[7]] %>%
  slice_head(n=10) %>%
  dplyr::select(2:7) %>%
  kbl(caption = "Disomic HSA21 Sample LRR Segments") %>%
  kable_styling(
    bootstrap_options = c("hover", "condensed"),
    position = "center",
    html_font = "Arial Narrow") %>%
  kable_paper(full_width = F)
Disomic HSA21 Sample LRR Segments
chrom loc.start loc.end num.mark seg.mean CN
21 0 15501522 41 0.0061 2
21 15535590 19689161 502 0.0584 2
21 19695261 24911391 646 0.1202 2
21 24929472 33548880 1007 0.0405 2
21 33553000 42731511 1223 -0.0050 2
21 42739979 48084628 846 -0.0678 2

Segmental Duplication Analysis by LRR

HSA21 Gain Coverage

Summarize the extent of HSA21q that is covered by segments with average LRR > 0.15 or CN =3.

hsa21_gain_prop_list <- lapply(seq_along(donor_hsa21_list), function(i){
  gain_coverage(donor_hsa21_list[[i]], seg_df_list[[i]], "21")
})
names(hsa21_gain_prop_list) <- names(donor_hsa21_list)

HSA21 Gain Coverage

lrr_seg_plot_list <- purrr::pmap(
  list(df1 = donor_hsa21_list, seg = seg_df_list, prop = hsa21_gain_prop_list),
  function(df1, seg, prop){
    gain_df1 <- dplyr::filter(seg, chrom == "21", CN == 3)
    plot_lrr_seg(df1, gain_df1, prop)
  }
)

names(lrr_seg_plot_list) <- names(donor_hsa21_list)

Disomic Sample HSA21 LRR Segment CN Prediction

None of the contiguous segments have an average LRR that constitutes CN =3. Average LRR = 0 across HSA21.

lrr_seg_plot_list[[7]]
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

Trisomic Sample HSA21 LRR Segment CN Prediction

Over 50% of the contiguous segments have average LRR consistent with CN = 3 (89%), highlighted in gray. Average LRR > 0 across HSA21.

lrr_seg_plot_list[[6]]

Mosacism

Estimate HSA21 Mosacism with BAF

Pull the BAF of heterozygous SNPs within each segment. Calculate the median BAF for for the lower (AAB) and upper (ABB) heterozygote clusters to determine the extent of trisomy per segment.

The median BAF upper heterozygote cluster

\[ BAF(upper) = (1-m)(0.5) + m(0.67) \]

The median BAF lower heterozygote cluster is equal to the sum of the product of the fraction of diploid mosaic cells and the expected diploid heterozygote BAF (0.5, AB), and the product of the fraction of triploid mosaic cells and the expected lower triploid heterozygote BAF (0.33, AAB).

\[ BAF(lower) = (1-m)(0.5) + m(0.33) \]

Simplified, the median BAF upper heterozygote cluster is equal to the sum of the expected diploid heterozygote BAF (0.5, AB) and the product of 0.17 and the triploid fraction. The median BAF lower heterozygote cluster is equal to the difference between the expected diploid heterozygote BAF (0.5, AB) and the product of 0.17 and the triploid fraction.

\[ BAF(upper) = 0.5 + 0.17m BAF(lower) = 0.5 - 0.17m \]

Simplified to isolate m, the fraction of triploid mosaic cells is equal to the quotient of the difference between median upper heterozygote BAF and the expected diploid heterozygote BAF (0.5, AB) and 0.17. Simplified even further, the fraction of triploid mosaic cells is equal to the product of 5.88 (1/0.17) and the difference between median upper heterozygote BAF and the expected diploid heterozygtoe BAF (0.5, AB)

\[ m = \frac{BAF(upper) - 0.5}{0.17} \] \[ m = 5.88(BAF(upper)-0.5) \]

Establish function that assesses BAF heterozygosity in triplicated regions identified as CN=3 by CNA. A heterozygous BAF will split 0.5 ± m/6 . Therefore m ≈ 6 * (median_upper_band - 0.5) where m is the mosaic fraction of cells with trisomy.

mosaic_out_list <- purrr::map(seq_along(donor_hsa21_list), function(n){

  gt_df  <- donor_hsa21_list[[n]]
  seg_df <- seg_df_list[[n]]

  if (nrow(seg_df) == 0) return(NULL)

  mosaic_tab <- purrr::pmap_dfr(
    list(seg_df$chrom, seg_df$loc.start, seg_df$loc.end),
    ~ estimate_mosaic(gt_df, ..1, ..2, ..3)
  )

  dplyr::bind_cols(
    seg_df %>% dplyr::select(chrom, loc.start, loc.end, num.mark, seg.mean, CN),
    mosaic_tab
  )
})

names(mosaic_out_list) <- names(donor_hsa21_list)

mosaic_out_list[[6]] %>%
  slice_head(n=10) %>%
  kbl(caption = "Trisomic HSA21 Sample Triplicated Fraction") %>%
  kable_styling(
    bootstrap_options = c("hover", "condensed"),
    position = "center",
    html_font = "Arial Narrow") %>%
  kable_paper(full_width = F)
Trisomic HSA21 Sample Triplicated Fraction
chrom loc.start loc.end num.mark seg.mean CN baf_lower baf_upper cn_fraction_m n_het
21 0 15463956 31 0.1666 3 0.3124570 0.6237732 0.728 11
21 15474153 26451776 1354 0.3059 3 0.3229270 0.6502674 0.884 727
21 26452133 28489705 242 0.2173 3 0.3226239 0.6571028 0.924 122
21 28494440 30043959 191 0.2975 3 0.3203439 0.6386812 0.815 113
21 30057145 42711678 1599 0.1925 3 0.3235740 0.6374070 0.808 746
21 42711936 48084628 848 0.1163 2 0.3256005 0.6334760 0.785 369

HSA21 Mosaic Fraction

baf_mos_plot_list <- purrr::pmap(
  list(df1 = donor_hsa21_list, seg = seg_df_list),
  function(df1, seg){
    plot_baf_mosaic(df1, seg)
  }
)

names(baf_mos_plot_list) <- names(donor_hsa21_list)

Disomic Sample HSA21 BAF Distribution

Heterozygote cluster BAF are 50% indicative of two copies of HSA21. Mosaic fraction below 50% in non-trisomic samples confirms disomy.

baf_mos_plot_list[[7]]

Trisomic Sample HSA21 BAF Distriution

Heterozygote cluster BAF at approximately 33% and 66% indicative of three copies of HSA21. Non-Mosaic: over 50% of the HSA21 segments have BAF score indicative of CN = 3 indicative of full trisomy over HSA21.

baf_mos_plot_list[[6]]

Environment

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-apple-darwin20
## Running under: macOS 26.0.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Detroit
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.4.0  knitr_1.50        DNAcopy_1.78.0    segmented_2.1-4  
##  [5] nlme_3.1-168      MASS_7.3-65       changepoint_2.3   zoo_1.8-14       
##  [9] ggplot2_4.0.1     data.table_1.17.8 purrr_1.2.0       stringr_1.6.0    
## [13] dplyr_1.1.4      
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     xml2_1.4.0         stringi_1.8.7     
##  [5] lattice_0.22-7     digest_0.6.38      magrittr_2.0.4     evaluate_1.0.5    
##  [9] grid_4.4.1         RColorBrewer_1.1-3 fastmap_1.2.0      jsonlite_2.0.0    
## [13] viridisLite_0.4.2  scales_1.4.0       textshaping_1.0.4  jquerylib_0.1.4   
## [17] cli_3.6.5          rlang_1.1.6        splines_4.4.1      withr_3.0.2       
## [21] cachem_1.1.0       yaml_2.3.10        tools_4.4.1        vctrs_0.6.5       
## [25] R6_2.6.1           lifecycle_1.0.4    pkgconfig_2.0.3    pillar_1.11.1     
## [29] bslib_0.9.0        gtable_0.3.6       glue_1.8.0         systemfonts_1.3.1 
## [33] xfun_0.54          tibble_3.3.0       tidyselect_1.2.1   rstudioapi_0.17.1 
## [37] dichromat_2.0-0.1  farver_2.1.2       htmltools_0.5.8.1  labeling_0.4.3    
## [41] rmarkdown_2.30     svglite_2.2.1      compiler_4.4.1     S7_0.2.1